# LOAD NECESSARY PACKAGES ------------------------------------------------------

library(pacman)

# tidyverse
p_load(dplyr, tidyr, stringr, ggplot2, forcats, purrr, tidylog)

# misc
p_load(here, patchwork, readstata13, naniar, WDI)

# custom 
# sum with na.rm = T unless ALL NA
suma = function(x) if (all(is.na(x))) x[NA_integer_] else sum(x, na.rm = TRUE)

# HOUSEHOLD LONGITUDINAL DATASET -----------------------------------------------

# Household tracking -----------------------------------------------------------

nga_track_2010  <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post harvest Wave 1", "Household", 
             "secta_harvestw1.dta"),generate.factors = F, convert.factors = F
  ) %>% 
  transmute(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid) %>% 
  mutate(year = 2010,
         rural = as.numeric(rural == 2))

nga_track_2012  <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post harvest Wave 2", "Household", 
             "secta_harvestw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  transmute(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid) %>% 
  mutate(year = 2012,
         rural = as.numeric(rural == 2))

nga_track_2015  <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sectaa_harvestw3.dta"),
  generate.factors = F, convert.factors = F
  ) %>% 
  transmute(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid) %>% 
  mutate(year = 2015,
         rural = as.numeric(rural == 2))

# Pooled
nga_pool_1admin <- do.call("rbind", list(nga_track_2010, nga_track_2012, nga_track_2015))
rm(nga_track_2010, nga_track_2012, nga_track_2015)

# Household demographics -------------------------------------------------------

nga_2010_sec1 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", "Household", 
             "sect1_harvestw1.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  mutate(
    s1q4 = ifelse(s1q4 == 999, NA, s1q4),
    year = 2010,
    hh_role = s1q3,
    above15 = as.numeric(s1q4 > 15),
    adultage = ifelse(above15 == 1, s1q4, NA),
    adultsex = ifelse(above15 == 1, s1q2, NA)
  ) %>% 
  group_by(hhid, year) %>% 
  summarise(
    hh_size = sum(above15 == 1, na.rm = T),
    hh_avgage = mean(adultage, na.rm = T),
    hh_avgmale = mean(adultsex == "1", na.rm = T),
    hh_headmarr = sum(hh_role == 1 & s1q7 %in% c(1,2))
  ) %>% 
  ungroup() %>% 
  select(year, hhid, hh_size, hh_avgage, hh_avgmale, hh_headmarr)

indiv_2010 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", "Household", 
             "sect1_harvestw1.dta"), generate.factors = F, convert.factors = F
) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  mutate(
    s1q4 = ifelse(s1q4 == 999, NA, s1q4)) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid,indiv),
    above15 = as.numeric(s1q4 > 15),
    above15 = ifelse(is.na(above15), 0, above15),
    female = case_when(
      s1q2 == 2 ~ 1,
      s1q2 == 1 ~ 0,
      TRUE ~ NA_real_)
  )

nga_2012_sec1 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", 
             "sect1_harvestw2.dta"), generate.factors = F, convert.factors = F,
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    s1q4 = ifelse(s1q4 == 999, NA, s1q4),
    s1q7 = s1q7,
    s1q28 = s1q28,
    year = 2012,
    hh_role = s1q3,
    above15 = as.numeric(s1q4 > 15),
    # still a member of the household OR a new household member (NA)
    stillmember = as.numeric(s1q14 == 1 | is.na(s1q14)),
    adultage = ifelse(above15 == 1 & stillmember == 1, s1q4, NA),
    adultsex = ifelse(above15 == 1 & stillmember == 1, s1q2, NA)
  ) %>% 
  group_by(hhid, year) %>% 
  summarise(
    hh_size = sum(above15 == 1 & stillmember == 1, na.rm = T),
    hh_avgage = mean(adultage, na.rm = T),
    hh_avgmale = mean(adultsex == "1" & stillmember == 1, na.rm = T),
    hh_headmarr = sum(hh_role == 1 & s1q7 %in% c(1,2))
  ) %>% 
  ungroup() %>% 
  select(year, hhid, hh_size, hh_avgage, hh_avgmale, hh_headmarr)

indiv_2012 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", "sect1_harvestw2.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  mutate(s1q4 = ifelse(s1q4 > 200, NA, s1q4)) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid,indiv),
    above15 = as.numeric(s1q4 > 15),
    above15 = ifelse(is.na(above15), 0, above15),
    female = case_when(
      s1q2 == 2 ~ 1,
      s1q2 == 1 ~ 0,
      TRUE ~ NA_real_),
    # still a member of the household OR a new household member (NA)
    stillmember = as.numeric(s1q14 == 1 | is.na(s1q14)),
  )

nga_2015_sec1 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect1_harvestw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    s1q4 = ifelse(s1q4 == 999, NA, s1q4),
    s1q7 = s1q7,
    s1q28 = s1q28,
    year = 2015,
    hh_role = s1q3,
    above15 = as.numeric(s1q4 > 15),
    # still a member of the household OR a new household member
    stillmember = as.numeric(s1q4a == 1),
    adultage = ifelse(above15 == 1 & stillmember == 1, s1q4, NA),
    adultsex = ifelse(above15 == 1 & stillmember == 1, s1q2, NA)
  ) %>% 
  group_by(hhid, year) %>% 
  summarise(
    hh_size = sum(above15 == 1 & stillmember == 1, na.rm = T),
    hh_avgage = mean(adultage, na.rm = T),
    hh_avgmale = mean(adultsex == "1" & stillmember == 1, na.rm = T),
    hh_headmarr = sum(hh_role == 1 & s1q7 %in% c(1,2))
  ) %>% 
  ungroup() %>% 
  select(year, hhid, hh_size, hh_avgage, hh_avgmale, hh_headmarr)

indiv_2015 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect1_harvestw3.dta"), 
  generate.factors = F, convert.factors = F) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  mutate(s1q4 = ifelse(s1q4 > 200, NA, s1q4)) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid,indiv),
    above15 = as.numeric(s1q4 > 15),
    above15 = ifelse(is.na(above15), 0, above15),
    female = case_when(
      s1q2 == 2 ~ 1,
      s1q2 == 1 ~ 0,
      TRUE ~ NA_real_),
    # still a member of the household OR a new household member
    stillmember = as.numeric(s1q4a == 1)
  )

# Pooled
nga_pool_2dem <- do.call("rbind", list(nga_2010_sec1, nga_2012_sec1, nga_2015_sec1))
rm(nga_2010_sec1, nga_2012_sec1, nga_2015_sec1)

# Household highest years of education -----------------------------------------

nga_2010_sec2 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Planting Wave 1", "Household", 
             "sect2_plantingw1.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  mutate(indiv = paste0(hhid, indiv)) %>% 
  left_join(indiv_2010) %>% 
  filter(above15 == 1) %>% 
  transmute(
    hhid,
    hh_educ = case_when(
      s2q8 %in% c(1) ~ 0,
      s2q8 %in% c(2, 3, 4, 13) ~ 6,
      s2q8 %in% c(5) ~ 9,
      s2q8 %in% c(6, 7, 8) ~ 12,
      s2q8 %in% c(9, 10) ~ 15,
      s2q8 %in% c(11) ~ 17,
      s2q8 %in% c(12) ~ 20,
      TRUE ~ 0)
  ) %>% 
  group_by(hhid) %>% 
  summarise(hh_educ = max(hh_educ, na.rm = T)) %>% 
  mutate(year = 2010,
         hh_educ = ifelse(is.infinite(hh_educ), NA, hh_educ)
         ) %>% 
  ungroup()

nga_2012_sec2 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Planting Wave 2", "Household", 
             "sect2_plantingw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  mutate(indiv = paste0(hhid, indiv)) %>% 
  left_join(indiv_2012) %>% 
  filter(above15 == 1, stillmember == 1) %>% 
  mutate(
    hh_educ = case_when(
      s2q9 %in% c(1) ~ 0,
      s2q9 %in% c(2, 3, 4, 13) ~ 6,
      s2q9 %in% c(5) ~ 9,
      s2q9 %in% c(6, 7, 8) ~ 12,
      s2q9 %in% c(9, 10) ~ 15,
      s2q9 %in% c(11) ~ 17,
      s2q9 %in% c(12) ~ 20,
      TRUE ~ 0)
  ) %>% 
  group_by(hhid) %>% 
  summarise(hh_educ = max(hh_educ, na.rm = T)) %>% 
  mutate(year = 2012,
         hh_educ = ifelse(is.infinite(hh_educ), NA, hh_educ)
  ) %>% 
  ungroup()

nga_2015_sec2 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect2_harvestw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  mutate(indiv = paste0(hhid, indiv)) %>% 
  left_join(indiv_2015) %>% 
  filter(above15 == 1, stillmember == 1) %>% 
  mutate(
    hh_educ = case_when(
      s2aq10 %in% c(1) ~ 0,
      s2aq10 %in% c(2, 3, 4, 13) ~ 6,
      s2aq10 %in% c(5) ~ 9,
      s2aq10 %in% c(6, 7, 8) ~ 12,
      s2aq10 %in% c(9, 10) ~ 15,
      s2aq10 %in% c(11) ~ 17,
      s2aq10 %in% c(12) ~ 20,
      TRUE ~ 0)
  ) %>% 
  group_by(hhid) %>% 
  summarise(hh_educ = max(hh_educ, na.rm = T)) %>% 
  mutate(year = 2015,
         hh_educ = ifelse(is.infinite(hh_educ), NA, hh_educ)
  ) %>% 
  ungroup()

# Pooled
nga_pool_3educ <- do.call("rbind", list(nga_2010_sec2, nga_2012_sec2, nga_2015_sec2))
rm(nga_2010_sec2, nga_2012_sec2, nga_2015_sec2)

# Household assets -------------------------------------------------------------

assets <- tibble::tribble(
            ~item_cd,       ~asset,
                301L,       "asset_Sofa",
                302L,     "asset_Chairs",
                303L,     "asset_Tables",
                304L,   "asset_Mattress",
                305L,        "asset_Bed",
                306L,        "asset_Mat",
                317L,    "asset_Bicycle",
                318L,  "asset_Motorbike",
                319L,        "asset_Car"
            )

nga_2010_sec7 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", "Household", 
             "sect7_harvestw1.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  filter(item_cd %in% c(301, 302, 303, 304, 305, 306, 317, 318, 319)) %>%  
  left_join(assets) %>% 
  transmute(hhid = hhid, asset = asset, own = as.numeric(s7x == "X" | s7q5 == 1)
  ) %>% 
  pivot_wider(names_from = asset, values_from = own) %>% 
  mutate_all(~ifelse(is.na(.), 0, .)) %>% 
  mutate(year = 2010)

nga_2012_sec7 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", 
             "sect7_harvestw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  filter(item_cd %in% c(301, 302, 303, 304, 305, 306, 317, 318, 319)) %>%  
  left_join(assets) %>% 
  transmute(hhid = hhid, asset = asset, own = as.numeric(s7 > 0 | s7q5 == 1)
  ) %>% 
  pivot_wider(names_from = asset, values_from = own) %>% 
  mutate_all(~ifelse(is.na(.), 0, .)) %>% 
  mutate(year = 2012)

nga_2015_sec7 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect5_plantingw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  filter(item_cd %in% c(301, 302, 303, 304, 305, 306, 317, 318, 319)) %>%  
  left_join(assets) %>% 
  transmute(hhid = hhid, asset = asset, own = as.numeric(s5q1 > 0)
  ) %>% 
  pivot_wider(names_from = asset, values_from = own) %>% 
  mutate_all(~ifelse(is.na(.), 0, .)) %>% 
  mutate(year = 2015)

# Panel
nga_pool_5asset <- do.call("rbind", list(nga_2010_sec7, nga_2012_sec7, nga_2015_sec7))
rm(nga_2010_sec7, nga_2012_sec7, nga_2015_sec7)

# Household electricity source -------------------------------------------------
nga_2010_sec8 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", "Household", 
             "sect8_harvestw1.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    year = 2010,
    el_grid = as.numeric(s8q19 %in% c(1,2,4,5)),
    el_dgen = as.numeric(s8q19 %in% c(3,4,5)),
    com_grid = ifelse(s8q26 == 1 | el_grid == 1, 1,0),
    hh_rooms = s8q9) %>% 
  ungroup()

nga_2012_sec8 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", 
             "sect8_harvestw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    year = 2012,
    el_grid = as.numeric(s8q19 %in% c(1,2,4,5)),
    el_dgen = as.numeric(s8q19 %in% c(3,4,5)),
    com_grid = ifelse(s8q26 == 1 | el_grid == 1, 1,0),
    hh_rooms = s8q9) %>% 
  ungroup()

nga_2015_sec8 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect11_plantingw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    year = 2015,
    el_grid = as.numeric(s11q19b %in% c(1,2)),
    el_dgen = as.numeric(s11q28b == 1),
    com_grid = ifelse(s11q26 == 1 | el_grid == 1, 1,0),
    hh_rooms = s11q9) %>% 
  ungroup()

# Panel
nga_pool_6elec <- do.call("rbind", list(nga_2010_sec8, nga_2012_sec8, nga_2015_sec8))
rm(nga_2010_sec8, nga_2012_sec8, nga_2015_sec8)

# Household non-farm entrepreneurship ------------------------------------------

sex_2010 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", "Household", 
             "sect1_harvestw1.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    female = as.numeric(s1q2 == 2),
    female = ifelse(is.na(female), 0, female)
  ) %>% 
  group_by(hhid) %>% 
  summarise(female_vec = paste(female, collapse = '')
  )

nga_2010_sec9 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", "Household", 
             "sect9_harvestw1.dta"), generate.factors = F, convert.factors = F,
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  left_join(sex_2010) %>% 
  group_by(hhid) %>% 
  transmute(
    # Any enterprise not permanently closed
    any = as.numeric(s9q3 != 2 & !is.na(s9q3)),
    # Retail enterprises not permanently closed
    retail = as.numeric(s9q1a %in% 45:47 & s9q3 != 2 & !is.na(s9q3)),
    # Manufacturing enterprises not permanently closed
    manuf = as.numeric(s9q1a %in% 10:33 & s9q3 != 2 & !is.na(s9q3)),
    # NFE Capital stock and set to 0 if NA (or implicit NA)
    capstock = as.numeric(s9q24),
    capstock = ifelse(is.na(capstock) | capstock == 99999998, 0, capstock),
    firstowner = s9q5a,
    secondowner = s9q5b,
    nfe_wmn = case_when(
      substr(female_vec,firstowner,firstowner) == 1 |
        substr(female_vec,secondowner,secondowner) == 1 ~ 1,
      TRUE ~ 0)
    ) %>% 
  group_by(hhid) %>% 
  # Determine total NFEs (+ retail / manuf) with at least some reported capital stock
  summarise(nfe_tot = sum(capstock > 0),
            nfe_retail = sum((retail*capstock) > 0),
            nfe_manuf = sum((manuf*capstock) > 0),
            nfe_wmntot = sum(any == 1 & nfe_wmn == 1 & capstock > 0),
            nfe_wmnretail = sum(as.numeric(retail == 1 & nfe_wmn == 1 & capstock > 0)),
            nfe_wmnmanuf = sum(as.numeric(manuf == 1 & nfe_wmn == 1 & capstock > 0))
            ) %>% 
  mutate(year = 2010) %>% 
  ungroup()

sex_2012 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", 
             "sect1_harvestw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    female = as.numeric(s1q2 == 2),
    female = ifelse(is.na(female), 0, female)
  ) %>% 
  group_by(hhid) %>% 
  summarise(female_vec = paste(female, collapse = '')
  )

nga_2012_sec9 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", 
             "sect9_harvestw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  left_join(sex_2012) %>% 
  group_by(hhid) %>% 
  transmute(
    # Any enterprise not permanently closed
    any = as.numeric(s9q3 != 2 & !is.na(s9q3)),
    # Retail enterprises not permanently closed
    retail = as.numeric(s9q1b %in% 45:47 & s9q3 != 2 & !is.na(s9q3)),
    # Manufacturing enterprises not permanently closed
    manuf = as.numeric(s9q1b %in% 10:33 & s9q3 != 2 & !is.na(s9q3)),
    # NFE Capital stock and set to 0 if NA (or implicit NA)
    capstock = as.numeric(s9q24),
    capstock = ifelse(is.na(capstock) | capstock == 99999998, 0, capstock),
    firstowner = s9q5a1,
    secondowner = s9q5a2,
    nfe_wmn = case_when(
      substr(female_vec,firstowner,firstowner) == 1 |
        substr(female_vec,secondowner,secondowner) == 1 ~ 1,
      TRUE ~ 0)
  ) %>% 
  group_by(hhid) %>% 
  # Determine total NFEs (+ retail / manuf) with at least some reported capital stock
  summarise(nfe_tot = sum(capstock > 0),
            nfe_retail = sum((retail*capstock) > 0),
            nfe_manuf = sum((manuf*capstock) > 0),
            nfe_wmntot = sum(any == 1 & nfe_wmn == 1 & capstock > 0),
            nfe_wmnretail = sum(as.numeric(retail == 1 & nfe_wmn == 1 & capstock > 0)),
            nfe_wmnmanuf = sum(as.numeric(manuf == 1 & nfe_wmn == 1 & capstock > 0))
  ) %>% 
  mutate(year = 2012) %>% 
  ungroup()

sex_2015 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect1_harvestw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    female = as.numeric(s1q2 == 2),
    female = ifelse(is.na(female), 0, female)
  ) %>% 
  group_by(hhid) %>% 
  summarise(female_vec = paste(female, collapse = '')
  )

nga_2015_sec9 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect9_harvestw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  left_join(sex_2015) %>% 
  group_by(hhid) %>% 
  transmute(
    # Any enterprise not permanently closed
    any = as.numeric(s9q3 != 2 & !is.na(s9q3)),
    # Retail enterprises not permanently closed
    retail = as.numeric(s9q1b %in% 45:47 & s9q3 != 2 & !is.na(s9q3)),
    # Manufacturing enterprises not permanently closed
    manuf = as.numeric(s9q1b %in% 10:33 & s9q3 != 2 & !is.na(s9q3)),
    # NFE Capital stock and set to 0 if NA (or implicit NA)
    capstock = as.numeric(s9q24),
    capstock = ifelse(is.na(capstock) | capstock == 99999998, 0, capstock),
    firstowner = s9q5a1,
    secondowner = s9q5a2,
    nfe_wmn = case_when(
      substr(female_vec,firstowner,firstowner) == 1 |
        substr(female_vec,secondowner,secondowner) == 1 ~ 1,
      TRUE ~ 0)
  ) %>% 
  group_by(hhid) %>% 
  # Determine total NFEs (+ retail / manuf) with at least some reported capital stock
  summarise(nfe_tot = sum(capstock > 0),
            nfe_retail = sum((retail*capstock) > 0),
            nfe_manuf = sum((manuf*capstock) > 0),
            nfe_wmntot = sum(any == 1 & nfe_wmn == 1 & capstock > 0),
            nfe_wmnretail = sum(as.numeric(retail == 1 & nfe_wmn == 1 & capstock > 0)),
            nfe_wmnmanuf = sum(as.numeric(manuf == 1 & nfe_wmn == 1 & capstock > 0))
  ) %>% 
  mutate(year = 2015) %>% 
  ungroup()

# Panel
nga_pool_7nfe <- do.call("rbind", list(nga_2010_sec9, nga_2012_sec9, nga_2015_sec9))
rm(nga_2010_sec9, nga_2012_sec9, nga_2015_sec9, sex_2010, 
   sex_2012, sex_2015)

# Household geovariables -------------------------------------------------------

nga_2010_hh_geo <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Geodata", "NGA_HouseholdGeovariables_Y1.dta"), 
  generate.factors = F, convert.factors = F) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    year = 2010,
    dist_road = dist_road,
    dist_mkt = dist_market,
    dist_popcenter = dist_popcenter,
    dist_admctr = dist_admctr,
    pct_agr = fsrad3_agpct,
    latitude = lat_dd_mod,
    longitude = lon_dd_mod)

nga_2012_hh_geo <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Geodata Wave 2", "NGA_HouseholdGeovars_Y2.dta"), 
  generate.factors = F, convert.factors = F) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    year = 2012,
    dist_road = dist_road2,
    dist_mkt = dist_market,
    dist_popcenter = dist_popcenter2,
    dist_admctr = dist_admctr,
    pct_agr = fsrad3_agpct,
    latitude = LAT_DD_MOD,
    longitude = LON_DD_MOD)


nga_2015_hh_geo <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "NGA_HouseholdGeovars_Y3.dta"), 
  generate.factors = F, convert.factors = F) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    year = 2015,
    dist_road = dist_road2,
    dist_mkt = dist_market,
    dist_popcenter = dist_popcenter2,
    dist_admctr = dist_admctr,
    pct_agr = fsrad3_agpct,
    latitude = LAT_DD_MOD,
    longitude = LON_DD_MOD)

# Cross-section
nga_pool_8geo <- do.call("rbind", list(nga_2010_hh_geo, nga_2012_hh_geo,
                                 nga_2015_hh_geo))
rm(nga_2010_hh_geo, nga_2012_hh_geo,nga_2015_hh_geo)

# HOUSEHOLD LONGITUDINAL DATASET OUTPUT ----------------------------------------

nga_hh_panel <- list(nga_pool_2dem, nga_pool_1admin, nga_pool_3educ,
                     nga_pool_5asset, nga_pool_6elec,
                     nga_pool_7nfe, nga_pool_8geo) %>% 
  reduce(left_join) %>% 
  ungroup() %>% 
  # Remove households where lon-lat or rurality changes between first and last wave
  group_by(hhid) %>% 
  filter(longitude == median(longitude), latitude == median(latitude),
         rural == median(rural)) %>% 
  # As EAs are unreliable, merge by longitude and latide
  mutate(ea = paste0(longitude,"_",latitude))

# Calculate final variables and summarise
nga_hh_panel <- nga_hh_panel %>% 
  # NFE NA values simply artefact of joining (see nfe tibble rows)
  mutate(across(matches("nfe"), ~ifelse(is.na(.),0,.))) %>% 
  group_by(year, ea) %>%
  # if at least 50% of households state that they or the community has grid, we assume the community has grid
  mutate(com_grid = ifelse(mean(com_grid == 1 | el_grid == 1, na.rm = T) >= 0.5, 1, 0)) %>% 
  ungroup() %>% 
  select(year, rural, admin1, admin2, admin3, ea, hhid, com_grid, matches("el"),  matches("hh_emp"), 
         matches("nfe"), everything()) %>% 
  ungroup()

saveRDS(nga_hh_panel, here::here("Data", "nigeria_hhpanel.rds"))

# INDIVUDUAL LONGITUDINAL DATASET ----------------------------------------------

# Individual demographics ------------------------------------------------------

nga_indiv_2010_sec1 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", "Household", 
             "sect1_harvestw1.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid,
         s1q4 = ifelse(s1q4 == 999, NA, s1q4)
  ) %>% 
  filter(s1q4 > 15) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2010,
    head = as.numeric(s1q3 == 1),
    age = s1q4,
    sexfem = as.numeric(s1q2 == 2),
    mar = as.numeric(s1q7 %in% c(1,2)))

nga_indiv_2012_sec1 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", 
             "sect1_harvestw2.dta"), generate.factors = F, convert.factors = F,
) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid,
         s1q4 = ifelse(s1q4 == 999, NA, s1q4),
         # still a member of the household OR a new household member (NA)
         stillmember = as.numeric(s1q14 == 1 | is.na(s1q14))
  ) %>% 
  filter(s1q4 > 15, stillmember == 1) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2012,
    head = as.numeric(s1q3 == 1),
    age = s1q4,
    sexfem = as.numeric(s1q2 == 2),
    mar = as.numeric(s1q7 %in% c(1,2)))

nga_indiv_2015_sec1 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect1_harvestw3.dta"), 
  generate.factors = F, convert.factors = F
) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid,
         s1q4 = ifelse(s1q4 == 999, NA, s1q4),
         # still a member of the household OR a new household member (NA)
         stillmember = as.numeric(s1q4a == 1 | is.na(s1q4a))
  ) %>% 
  filter(s1q4 > 15, stillmember == 1) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2015,
    head = as.numeric(s1q3 == 1),
    age = s1q4,
    sexfem = as.numeric(s1q2 == 2),
    mar = as.numeric(s1q7 %in% c(1,2)))

# Pooled
nga_indiv_pool_1dem <- do.call("rbind", list(nga_indiv_2010_sec1, nga_indiv_2012_sec1, nga_indiv_2015_sec1))
rm(nga_indiv_2010_sec1, nga_indiv_2012_sec1, nga_indiv_2015_sec1)

# Individual years of education ------------------------------------------------

nga_indiv_2010_sec2 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Planting Wave 1", "Household", 
             "sect2_plantingw1.dta"), generate.factors = F, convert.factors = F
) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2010,
    educ = case_when(
      s2q8 %in% c(1) ~ 0,
      s2q8 %in% c(2, 3, 4, 13) ~ 6,
      s2q8 %in% c(5) ~ 9,
      s2q8 %in% c(6, 7, 8) ~ 12,
      s2q8 %in% c(9, 10) ~ 15,
      s2q8 %in% c(11) ~ 17,
      s2q8 %in% c(12) ~ 20,
      TRUE ~ 0)
  )

nga_indiv_2012_sec2 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Planting Wave 2", "Household", 
             "sect2_plantingw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2012,
    educ = case_when(
      s2q9 %in% c(1) ~ 0,
      s2q9 %in% c(2, 3, 4, 13) ~ 6,
      s2q9 %in% c(5) ~ 9,
      s2q9 %in% c(6, 7, 8) ~ 12,
      s2q9 %in% c(9, 10) ~ 15,
      s2q9 %in% c(11) ~ 17,
      s2q9 %in% c(12) ~ 20,
      TRUE ~ 0)
  ) 

nga_indiv_2015_sec2 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect2_harvestw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2015,
    educ = case_when(
      s2aq10 %in% c(1) ~ 0,
      s2aq10 %in% c(2, 3, 4, 13) ~ 6,
      s2aq10 %in% c(5) ~ 9,
      s2aq10 %in% c(6, 7, 8) ~ 12,
      s2aq10 %in% c(9, 10) ~ 15,
      s2aq10 %in% c(11) ~ 17,
      s2aq10 %in% c(12) ~ 20,
      TRUE ~ 0)
  )

# Pooled
nga_indiv_pool_3educ <- do.call("rbind", list(nga_indiv_2010_sec2, nga_indiv_2012_sec2, nga_indiv_2015_sec2))
rm(nga_indiv_2010_sec2, nga_indiv_2012_sec2, nga_indiv_2015_sec2)


# Individual employment outcomes -----------------------------------------------

nga_indiv_2010_sec3 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2010", "Post Harvest Wave 1", 
             "Household", "sect3a_harvestw1.dta"), generate.factors = F, 
  convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2010,
    emp = ifelse(s3aq4 == 1, 1, 0),
    emp_farm = ifelse(s3aq4 == 1 & (s3aq11 == 1 | s3aq2 == 1), 1, 0),
    emp_nonfarm = ifelse(s3aq4 == 1 & emp_farm != 1, 1, 0)
  )

nga_indiv_2012_sec3 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2012", "Post Harvest Wave 2", "Household", 
             "sect3a_harvestw2.dta"), generate.factors = F, convert.factors = F
  ) %>% 
  mutate(admin1 = zone, 
         admin2 = state, 
         admin3 = lga, 
         rural = sector, 
         ea = paste0(admin1, admin2, admin3, ea), 
         hhid = hhid
  ) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2012,
    emp = ifelse(s3aq4 == 1, 1, 0),
    emp_farm = ifelse(s3aq4 == 1 & (s3aq11 == 1 | s3aq2 == 1), 1, 0),
    emp_nonfarm = ifelse(s3aq4 == 1 & emp_farm != 1, 1, 0)
  )

nga_indiv_2015_sec3 <- readstata13::read.dta13(
  here::here("Data", "Nigeria", "2015", "sect3_harvestw3.dta"), 
  generate.factors = F, convert.factors = F
  ) %>% 
  mutate(sect = case_when(
    is.na(s3q14) & s3q5 == 1 ~ 1, 
    is.na(s3q14) & s3q6 == 1 ~ 0,
    TRUE ~ as.numeric(as.character(s3q14)))
  ) %>% 
  transmute(
    hhid = hhid,
    indiv = paste0(hhid, "_", indiv),
    year = 2015,
    emp = ifelse(s3q7 == 1, 1, 0),
    emp_farm = ifelse(s3q7 == 1 & sect == 1, 1, 0),
    emp_nonfarm = ifelse(s3q7 == 1 & emp_farm != 1, 1, 0)
  )

# Cross-section
nga_indiv_pool_4emp <- do.call("rbind", list(nga_indiv_2010_sec3, nga_indiv_2012_sec3, nga_indiv_2015_sec3))
rm(nga_indiv_2010_sec3, nga_indiv_2012_sec3, nga_indiv_2015_sec3)

# INDIVIDUAL LONGITUDINAL DATASET OUTPUT ---------------------------------------

nga_indiv_panel <- list(nga_indiv_pool_1dem, nga_indiv_pool_3educ, nga_indiv_pool_4emp) %>% 
  # The large amount of "rows only in Y" is due to not filtering these to be adults
  reduce(inner_join) %>% 
  ungroup() %>% 
  # Keep only those households in the main household dataset and merge
  inner_join(nga_hh_panel)

saveRDS(nga_indiv_panel, here::here("Data", "nigeria_indivpanel.rds"))

